import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import tree
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, SCORERS, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE
from collections import Counter
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from imblearn.under_sampling import ClusterCentroids
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
CONTEXT: A telecom company wants to use their historical customer data to predict behaviour to retain customers. You can
analyse all relevant customer data and develop focused customer retention programs.
1. Import and warehouse data:
# importing all the datasets given
df1= pd.read_csv('TelcomCustomer-Churn_1.csv')
df2 = pd.read_csv('TelcomCustomer-Churn_2.csv')
df= pd.read_csv('TelcomCustomer-Churn.csv')
# Function to define the dataset
def explain_df(d):
print('The shape of the dataset:')
print(d.shape, '\n')
print('The size of the dataset:')
print(d.size, '\n')
print('the dimension of the dataset')
print(d.ndim, '\n')
print('Information about the dataset:')
print(d.info(), '\n')
print('5 point summary of the dataset:')
print(d.describe(), '\n')
print('Looking for null values:')
print(d.isnull().any(), '\n')
print('Checking for duplicates in the dataset:')
print(df.duplicated().any(), '\n')
# function to return the rows if mutiple records found
def ret_mutiple(df):
counts = df['customerID'].value_counts()
df= df[df['customerID'].isin(counts.index[counts > 1])]
return df
# Function to count the values present for each feature
def value_c(d):
print('Checking the value counts:')
for feature in d.columns: # Loop through all columns in the dataframe
if d[feature].dtype == 'object':
print(d[feature].value_counts(), '\n')
Let's see how each dataset looks
# First 5 values of df1
df1.head()
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No |
| 1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes |
| 2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes |
| 3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes |
| 4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No |
# First 5 values of df1
df2.head()
| customerID | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Yes | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | 5575-GNVDE | No | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | 3668-QPYBK | Yes | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | 7795-CFOCW | No | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | 9237-HQITU | No | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
# First 5 values of df
df.head()
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
5 rows × 21 columns
After looking at the data it seems we already have merged data set with us
# Confirming is df is the merged data set available to us
print('Size of 1st dataset with repeated customerID(if any) : ',ret_mutiple(df1).size)
print('Size of 2nd dataset with repeated customerID(if any) : ',ret_mutiple(df2).size)
print('Size of already mergerd dataset with repeated customerID(if any) : ',ret_mutiple(df).size)
Size of 1st dataset with repeated customerID(if any) : 0 Size of 2nd dataset with repeated customerID(if any) : 0 Size of already mergerd dataset with repeated customerID(if any) : 0
There is no repeated column present in the dataset and each record is a present in the merged dataset
Let's explain each datasets now :
# Checking the details of df1
explain_df(df1)
The shape of the dataset:
(7043, 10)
The size of the dataset:
70430
the dimension of the dataset
2
Information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 customerID 7043 non-null object
1 gender 7043 non-null object
2 SeniorCitizen 7043 non-null int64
3 Partner 7043 non-null object
4 Dependents 7043 non-null object
5 tenure 7043 non-null int64
6 PhoneService 7043 non-null object
7 MultipleLines 7043 non-null object
8 InternetService 7043 non-null object
9 OnlineSecurity 7043 non-null object
dtypes: int64(2), object(8)
memory usage: 550.4+ KB
None
5 point summary of the dataset:
SeniorCitizen tenure
count 7043.000000 7043.000000
mean 0.162147 32.371149
std 0.368612 24.559481
min 0.000000 0.000000
25% 0.000000 9.000000
50% 0.000000 29.000000
75% 0.000000 55.000000
max 1.000000 72.000000
Looking for null values:
customerID False
gender False
SeniorCitizen False
Partner False
Dependents False
tenure False
PhoneService False
MultipleLines False
InternetService False
OnlineSecurity False
dtype: bool
Checking for duplicates in the dataset:
False
# Checking the details of df2
explain_df(df2)
The shape of the dataset:
(7043, 12)
The size of the dataset:
84516
the dimension of the dataset
2
Information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 customerID 7043 non-null object
1 OnlineBackup 7043 non-null object
2 DeviceProtection 7043 non-null object
3 TechSupport 7043 non-null object
4 StreamingTV 7043 non-null object
5 StreamingMovies 7043 non-null object
6 Contract 7043 non-null object
7 PaperlessBilling 7043 non-null object
8 PaymentMethod 7043 non-null object
9 MonthlyCharges 7043 non-null float64
10 TotalCharges 7043 non-null object
11 Churn 7043 non-null object
dtypes: float64(1), object(11)
memory usage: 660.4+ KB
None
5 point summary of the dataset:
MonthlyCharges
count 7043.000000
mean 64.761692
std 30.090047
min 18.250000
25% 35.500000
50% 70.350000
75% 89.850000
max 118.750000
Looking for null values:
customerID False
OnlineBackup False
DeviceProtection False
TechSupport False
StreamingTV False
StreamingMovies False
Contract False
PaperlessBilling False
PaymentMethod False
MonthlyCharges False
TotalCharges False
Churn False
dtype: bool
Checking for duplicates in the dataset:
False
# Checking the details of df
explain_df(df)
The shape of the dataset:
(7043, 21)
The size of the dataset:
147903
the dimension of the dataset
2
Information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 customerID 7043 non-null object
1 gender 7043 non-null object
2 SeniorCitizen 7043 non-null int64
3 Partner 7043 non-null object
4 Dependents 7043 non-null object
5 tenure 7043 non-null int64
6 PhoneService 7043 non-null object
7 MultipleLines 7043 non-null object
8 InternetService 7043 non-null object
9 OnlineSecurity 7043 non-null object
10 OnlineBackup 7043 non-null object
11 DeviceProtection 7043 non-null object
12 TechSupport 7043 non-null object
13 StreamingTV 7043 non-null object
14 StreamingMovies 7043 non-null object
15 Contract 7043 non-null object
16 PaperlessBilling 7043 non-null object
17 PaymentMethod 7043 non-null object
18 MonthlyCharges 7043 non-null float64
19 TotalCharges 7043 non-null object
20 Churn 7043 non-null object
dtypes: float64(1), int64(2), object(18)
memory usage: 1.1+ MB
None
5 point summary of the dataset:
SeniorCitizen tenure MonthlyCharges
count 7043.000000 7043.000000 7043.000000
mean 0.162147 32.371149 64.761692
std 0.368612 24.559481 30.090047
min 0.000000 0.000000 18.250000
25% 0.000000 9.000000 35.500000
50% 0.000000 29.000000 70.350000
75% 0.000000 55.000000 89.850000
max 1.000000 72.000000 118.750000
Looking for null values:
customerID False
gender False
SeniorCitizen False
Partner False
Dependents False
tenure False
PhoneService False
MultipleLines False
InternetService False
OnlineSecurity False
OnlineBackup False
DeviceProtection False
TechSupport False
StreamingTV False
StreamingMovies False
Contract False
PaperlessBilling False
PaymentMethod False
MonthlyCharges False
TotalCharges False
Churn False
dtype: bool
Checking for duplicates in the dataset:
False
Inference
Let's confirm if we have a merged dataset with us
# Checking if all values of customerID exists in the merged dataset present with us
print('comparing 1st ds with merged:\n', df['customerID'].isin(df1['customerID']).value_counts())
print('\n')
print('comparing 2nd ds with merged:\n', df['customerID'].isin(df1['customerID']).value_counts())
comparing 1st ds with merged: True 7043 Name: customerID, dtype: int64 comparing 2nd ds with merged: True 7043 Name: customerID, dtype: int64
All the values are of customers are present in the merged data set. Hence we shall continue our process with the merged dataset only
2. Data cleansing:
# Since SeniorCitizen column has binary value present, Hence subsituting it with relevant values
repl= { 'SeniorCitizen' : {0:'No', 1: 'Yes'}}
df= df.replace(repl)
df.head()
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Female | No | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | 5575-GNVDE | Male | No | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | 3668-QPYBK | Male | No | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | 7795-CFOCW | Male | No | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | 9237-HQITU | Female | No | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
5 rows × 21 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerID 7043 non-null object 1 gender 7043 non-null object 2 SeniorCitizen 7043 non-null object 3 Partner 7043 non-null object 4 Dependents 7043 non-null object 5 tenure 7043 non-null int64 6 PhoneService 7043 non-null object 7 MultipleLines 7043 non-null object 8 InternetService 7043 non-null object 9 OnlineSecurity 7043 non-null object 10 OnlineBackup 7043 non-null object 11 DeviceProtection 7043 non-null object 12 TechSupport 7043 non-null object 13 StreamingTV 7043 non-null object 14 StreamingMovies 7043 non-null object 15 Contract 7043 non-null object 16 PaperlessBilling 7043 non-null object 17 PaymentMethod 7043 non-null object 18 MonthlyCharges 7043 non-null float64 19 TotalCharges 7043 non-null object 20 Churn 7043 non-null object dtypes: float64(1), int64(1), object(19) memory usage: 1.1+ MB
# Checking if any row contains non numerical value for the column TotalCharges
df[~df.TotalCharges.str.contains(r'[0-9]')]
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 488 | 4472-LVYGI | Female | No | Yes | Yes | 0 | No | No phone service | DSL | Yes | ... | Yes | Yes | Yes | No | Two year | Yes | Bank transfer (automatic) | 52.55 | No | |
| 753 | 3115-CZMZD | Male | No | No | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 20.25 | No | |
| 936 | 5709-LVOEQ | Female | No | Yes | Yes | 0 | Yes | No | DSL | Yes | ... | Yes | No | Yes | Yes | Two year | No | Mailed check | 80.85 | No | |
| 1082 | 4367-NUYAO | Male | No | Yes | Yes | 0 | Yes | Yes | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 25.75 | No | |
| 1340 | 1371-DWPAZ | Female | No | Yes | Yes | 0 | No | No phone service | DSL | Yes | ... | Yes | Yes | Yes | No | Two year | No | Credit card (automatic) | 56.05 | No | |
| 3331 | 7644-OMVMY | Male | No | Yes | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 19.85 | No | |
| 3826 | 3213-VVOLG | Male | No | Yes | Yes | 0 | Yes | Yes | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 25.35 | No | |
| 4380 | 2520-SGTTA | Female | No | Yes | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 20.00 | No | |
| 5218 | 2923-ARZLG | Male | No | Yes | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | One year | Yes | Mailed check | 19.70 | No | |
| 6670 | 4075-WKNIU | Female | No | Yes | Yes | 0 | Yes | Yes | DSL | No | ... | Yes | Yes | Yes | No | Two year | No | Mailed check | 73.35 | No | |
| 6754 | 2775-SEFEE | Male | No | No | Yes | 0 | Yes | Yes | DSL | Yes | ... | No | Yes | No | No | Two year | Yes | Bank transfer (automatic) | 61.90 | No |
11 rows × 21 columns
It can be observed from the above that the tenure for each of the record for which we donot have the Totalcharges values has null records. Let's confirm it.
# Checking if the missing value is specific to the Tenure
df[df.tenure==0]
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 488 | 4472-LVYGI | Female | No | Yes | Yes | 0 | No | No phone service | DSL | Yes | ... | Yes | Yes | Yes | No | Two year | Yes | Bank transfer (automatic) | 52.55 | No | |
| 753 | 3115-CZMZD | Male | No | No | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 20.25 | No | |
| 936 | 5709-LVOEQ | Female | No | Yes | Yes | 0 | Yes | No | DSL | Yes | ... | Yes | No | Yes | Yes | Two year | No | Mailed check | 80.85 | No | |
| 1082 | 4367-NUYAO | Male | No | Yes | Yes | 0 | Yes | Yes | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 25.75 | No | |
| 1340 | 1371-DWPAZ | Female | No | Yes | Yes | 0 | No | No phone service | DSL | Yes | ... | Yes | Yes | Yes | No | Two year | No | Credit card (automatic) | 56.05 | No | |
| 3331 | 7644-OMVMY | Male | No | Yes | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 19.85 | No | |
| 3826 | 3213-VVOLG | Male | No | Yes | Yes | 0 | Yes | Yes | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 25.35 | No | |
| 4380 | 2520-SGTTA | Female | No | Yes | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 20.00 | No | |
| 5218 | 2923-ARZLG | Male | No | Yes | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | One year | Yes | Mailed check | 19.70 | No | |
| 6670 | 4075-WKNIU | Female | No | Yes | Yes | 0 | Yes | Yes | DSL | No | ... | Yes | Yes | Yes | No | Two year | No | Mailed check | 73.35 | No | |
| 6754 | 2775-SEFEE | Male | No | No | Yes | 0 | Yes | Yes | DSL | Yes | ... | No | Yes | No | No | Two year | Yes | Bank transfer (automatic) | 61.90 | No |
11 rows × 21 columns
# Replacing the String value of ' ' to null
df['TotalCharges']=df["TotalCharges"].replace(" ",np.nan)
# Confirming if the null values is now present in the dataset
df.isnull().any()
customerID False gender False SeniorCitizen False Partner False Dependents False tenure False PhoneService False MultipleLines False InternetService False OnlineSecurity False OnlineBackup False DeviceProtection False TechSupport False StreamingTV False StreamingMovies False Contract False PaperlessBilling False PaymentMethod False MonthlyCharges False TotalCharges True Churn False dtype: bool
# Coverting the TotalCharges column to a numerical column
df['TotalCharges']=pd.to_numeric(df['TotalCharges'],errors='coerce')
# Describing the dataset again so that Totalcharges can be included to be checked
df.describe()
| tenure | MonthlyCharges | TotalCharges | |
|---|---|---|---|
| count | 7043.000000 | 7043.000000 | 7032.000000 |
| mean | 32.371149 | 64.761692 | 2283.300441 |
| std | 24.559481 | 30.090047 | 2266.771362 |
| min | 0.000000 | 18.250000 | 18.800000 |
| 25% | 9.000000 | 35.500000 | 401.450000 |
| 50% | 29.000000 | 70.350000 | 1397.475000 |
| 75% | 55.000000 | 89.850000 | 3794.737500 |
| max | 72.000000 | 118.750000 | 8684.800000 |
# Imputing the values for nan values to have with the mean value of the total charge column
df['TotalCharges']=df["TotalCharges"].replace(np.nan,np.mean(df.TotalCharges))
# CustomerID column is unique for each record and doesnot add much value to the dataset
# so, CustomerID column is a non important feature. Hence we can drop it
df.drop(['customerID'],axis=1,inplace=True)
df.shape
(7043, 20)
# Checking the 5 point summary after imputing the values for null Totalcharges
df.describe()
| tenure | MonthlyCharges | TotalCharges | |
|---|---|---|---|
| count | 7043.000000 | 7043.000000 | 7043.000000 |
| mean | 32.371149 | 64.761692 | 2283.300441 |
| std | 24.559481 | 30.090047 | 2265.000258 |
| min | 0.000000 | 18.250000 | 18.800000 |
| 25% | 9.000000 | 35.500000 | 402.225000 |
| 50% | 29.000000 | 70.350000 | 1400.550000 |
| 75% | 55.000000 | 89.850000 | 3786.600000 |
| max | 72.000000 | 118.750000 | 8684.800000 |
#Confirming if the dataset contains any null values
df.isnull().any().any()
False
# checking the value counts for each categorical columns
value_c(df)
Checking the value counts: Male 3555 Female 3488 Name: gender, dtype: int64 No 5901 Yes 1142 Name: SeniorCitizen, dtype: int64 No 3641 Yes 3402 Name: Partner, dtype: int64 No 4933 Yes 2110 Name: Dependents, dtype: int64 Yes 6361 No 682 Name: PhoneService, dtype: int64 No 3390 Yes 2971 No phone service 682 Name: MultipleLines, dtype: int64 Fiber optic 3096 DSL 2421 No 1526 Name: InternetService, dtype: int64 No 3498 Yes 2019 No internet service 1526 Name: OnlineSecurity, dtype: int64 No 3088 Yes 2429 No internet service 1526 Name: OnlineBackup, dtype: int64 No 3095 Yes 2422 No internet service 1526 Name: DeviceProtection, dtype: int64 No 3473 Yes 2044 No internet service 1526 Name: TechSupport, dtype: int64 No 2810 Yes 2707 No internet service 1526 Name: StreamingTV, dtype: int64 No 2785 Yes 2732 No internet service 1526 Name: StreamingMovies, dtype: int64 Month-to-month 3875 Two year 1695 One year 1473 Name: Contract, dtype: int64 Yes 4171 No 2872 Name: PaperlessBilling, dtype: int64 Electronic check 2365 Mailed check 1612 Bank transfer (automatic) 1544 Credit card (automatic) 1522 Name: PaymentMethod, dtype: int64 No 5174 Yes 1869 Name: Churn, dtype: int64
# Changing the columns classified as object to categorical columns
for feature in df.columns: # Loop through all columns in the dataframe
if df[feature].dtype == 'object': # Only apply for columns with categorical strings
df[feature] = pd.Categorical(df[feature])# Replace strings with an integer
# Checking the information about the dataset after the above change to category
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 7043 non-null category 1 SeniorCitizen 7043 non-null category 2 Partner 7043 non-null category 3 Dependents 7043 non-null category 4 tenure 7043 non-null int64 5 PhoneService 7043 non-null category 6 MultipleLines 7043 non-null category 7 InternetService 7043 non-null category 8 OnlineSecurity 7043 non-null category 9 OnlineBackup 7043 non-null category 10 DeviceProtection 7043 non-null category 11 TechSupport 7043 non-null category 12 StreamingTV 7043 non-null category 13 StreamingMovies 7043 non-null category 14 Contract 7043 non-null category 15 PaperlessBilling 7043 non-null category 16 PaymentMethod 7043 non-null category 17 MonthlyCharges 7043 non-null float64 18 TotalCharges 7043 non-null float64 19 Churn 7043 non-null category dtypes: category(17), float64(2), int64(1) memory usage: 283.9 KB
3. Data analysis & visualisation:
# Creating new dataframe to store the copy of the orginal dataset
df_corr= df.copy()
# Seggregating the columns based on its types
binary_cols = [] # categorical columns with binary values
multicat_cols=[] # categorical columns with multiple values
num_cols=[] #numerical cols
for f in df.drop('Churn', axis=1).columns: # removing the target variable
if df[f].dtype == 'int64' or df[f].dtype == 'float64' :
num_cols.append(f)
else:
if df[f].value_counts().shape[0] == 2:
binary_cols.append(f)
else:
multicat_cols.append(f)
# Checking the no of columns based on its types, seggreated in the previous step
print('No of Numerical Columns',len(num_cols))
print('No of Binary Columns',len(binary_cols))
print('No of Multicategory Columns',len(multicat_cols))
No of Numerical Columns 3 No of Binary Columns 6 No of Multicategory Columns 10
# Checking the correlation between the numerical columns
df.corr()
| tenure | MonthlyCharges | TotalCharges | |
|---|---|---|---|
| tenure | 1.000000 | 0.247900 | 0.824757 |
| MonthlyCharges | 0.247900 | 1.000000 | 0.650468 |
| TotalCharges | 0.824757 | 0.650468 | 1.000000 |
# plotting the graph to visualize the relation between the features
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x19e2da96cd0>
# Plotting the count plots for all the binary columns present for visualoization
fig, ax = plt.subplots(2, 3, figsize=(12, 7), sharey=True)
i=0
j=0
for col in binary_cols:
if j>=3:
i+=1
sns.countplot(col, data=df, ax=ax[i,0])
j=1
else:
sns.countplot(col, data=df, ax=ax[i,j])
j+=1
# Let's check the churn rate to understand how the features are effected wrt target
print('Churn Rate for Binary Columns:\n\n')
for col in binary_cols:
print(df.groupby(col).Churn.apply(lambda x: (x == 'Yes').mean()),'\n\n')
Churn Rate for Binary Columns: gender Female 0.269209 Male 0.261603 Name: Churn, dtype: float64 SeniorCitizen No 0.236062 Yes 0.416813 Name: Churn, dtype: float64 Partner No 0.329580 Yes 0.196649 Name: Churn, dtype: float64 Dependents No 0.312791 Yes 0.154502 Name: Churn, dtype: float64 PhoneService No 0.249267 Yes 0.267096 Name: Churn, dtype: float64 PaperlessBilling No 0.163301 Yes 0.335651 Name: Churn, dtype: float64
Checking the distribution of the numerical columns wrt to target column
sns.kdeplot(x="tenure",hue="Churn", data=df)
<AxesSubplot:xlabel='tenure', ylabel='Density'>
sns.kdeplot(data=df,x="MonthlyCharges",hue="Churn")
<AxesSubplot:xlabel='MonthlyCharges', ylabel='Density'>
sns.kdeplot(data=df,x="TotalCharges",hue="Churn")
<AxesSubplot:xlabel='TotalCharges', ylabel='Density'>
# Let's see if there is a difference in the in charges based on the gender
sns.barplot(y='TotalCharges', x='gender', hue='Churn', data=df)
<AxesSubplot:xlabel='gender', ylabel='TotalCharges'>
Male and females tend to spend the same for both of the categories: churning and non-churning
# Plotting the graph to explore about the customers who are partnered and have dependents
sns.countplot("Partner", data=df, hue = 'Dependents')
<AxesSubplot:xlabel='Partner', ylabel='count'>
# Let's check the distribution of the customers availing the online services
df['Count_OnlineServices'] =(df[['OnlineSecurity','DeviceProtection','StreamingMovies','TechSupport',
'StreamingTV', 'OnlineBackup']]=='Yes').sum(axis=1)
plt.figure(figsize=(9,6))
ax=sns.countplot(x='Count_OnlineServices',hue='Churn',data=df)
ax.set_xlabel('Number of Online Services')
ax.set_ylabel('Number of Customers')
Text(0, 0.5, 'Number of Customers')
# Let's check the impact of charges on the the attrition of customers who opted for online services
agg = df.replace('Yes',1).replace('No', 0).groupby('Count_OnlineServices', as_index=False)[['MonthlyCharges']].mean()
agg[['MonthlyCharges']] = np.round(agg[['MonthlyCharges']], 0)
plt.figure(figsize=(9,6))
ax = sns.barplot(x='Count_OnlineServices',y='MonthlyCharges',data=agg)
ax.set_xlabel('Number of Online Services Availed')
ax.set_ylabel('Average Monthly Charges')
Text(0, 0.5, 'Average Monthly Charges')
sns.pairplot(df[['tenure','MonthlyCharges','Churn']],hue='Churn');
# Plotting the distribution of multicategory columns for understanding and visualization
fig, ax = plt.subplots(5, 2, figsize=(25, 20), sharey=True)
i=0
j=0
for col in multicat_cols:
if j>=2:
i+=1
sns.countplot(col, data=df, ax=ax[i,0])
j=1
else:
sns.countplot(col, data=df, ax=ax[i,j])
j+=1
# Let's check the correlation between Total Charges incurred by the customer and the tenure for which they
# have been associated with the company
px.scatter(data_frame=df,x="TotalCharges",y="tenure",color="Churn")
Now let's check for any outliers present and see the impact it might have to the company
fig, ax =plt.subplots(1,3, figsize=(15, 5))
for i in range(len(num_cols)):
sns.boxplot( data=df,x='Churn', y=num_cols[i], ax=ax[i])
# Let's try to find out the outliers in for those churned
lq=df[df.Churn=='Yes'].TotalCharges.quantile(.25)
uq=df[df.Churn=='Yes'].TotalCharges.quantile(.75)
iqr=uq-lq
outlier = uq+(1.5 * iqr)
outlier_df = df[ (df.Churn=='Yes') & (df.TotalCharges > outlier) ]
tot_outlier = len(outlier_df.index)
outlier_avg = outlier_df.TotalCharges.mean()
outliers_sum = outlier_df.TotalCharges.sum()
print("No. of Customers who churned after paying high charges:", tot_outlier)
print("Average amount spent by such customer:",round( outlier_avg,2))
print("Loss in revenue for the company by such customers:", round(outliers_sum,2))
print("This is about",round((outlier_df.TotalCharges.sum()/df.TotalCharges.sum())*100,2),"% of total turnover that month")
No. of Customers who churned after paying high charges: 109 Average amount spent by such customer: 6670.28 Loss in revenue for the company by such customers: 727060.65 This is about 4.52 % of total turnover that month
# Checking the Churn Rate for Multicategory columns
print('Churn Rate for Multi Category Columns:\n\n')
for col in multicat_cols:
print(df.groupby(col).Churn.apply(lambda x: (x == 'Yes').mean()),'\n\n')
Churn Rate for Multi Category Columns: MultipleLines No 0.250442 No phone service 0.249267 Yes 0.286099 Name: Churn, dtype: float64 InternetService DSL 0.189591 Fiber optic 0.418928 No 0.074050 Name: Churn, dtype: float64 OnlineSecurity No 0.417667 No internet service 0.074050 Yes 0.146112 Name: Churn, dtype: float64 OnlineBackup No 0.399288 No internet service 0.074050 Yes 0.215315 Name: Churn, dtype: float64 DeviceProtection No 0.391276 No internet service 0.074050 Yes 0.225021 Name: Churn, dtype: float64 TechSupport No 0.416355 No internet service 0.074050 Yes 0.151663 Name: Churn, dtype: float64 StreamingTV No 0.335231 No internet service 0.074050 Yes 0.300702 Name: Churn, dtype: float64 StreamingMovies No 0.336804 No internet service 0.074050 Yes 0.299414 Name: Churn, dtype: float64 Contract Month-to-month 0.427097 One year 0.112695 Two year 0.028319 Name: Churn, dtype: float64 PaymentMethod Bank transfer (automatic) 0.167098 Credit card (automatic) 0.152431 Electronic check 0.452854 Mailed check 0.191067 Name: Churn, dtype: float64
# Let;s see how the Internet service is effected by the Monthly charges
sns.barplot(y='MonthlyCharges', x='InternetService', hue='Churn', data=df)
<AxesSubplot:xlabel='InternetService', ylabel='MonthlyCharges'>
The monthly charges for fiber optics is relatively higher indicating the reason of high churn rate for customers
who opted for Fiber Optics internet Service.
# Checking the relation between monthly charges and tenure wrt to the customer attrition
df[['tenure', 'MonthlyCharges', 'Churn']].groupby('Churn').mean().reset_index()
| Churn | tenure | MonthlyCharges | |
|---|---|---|---|
| 0 | No | 37.569965 | 61.265124 |
| 1 | Yes | 17.979133 | 74.441332 |
# Let's visualize how the contract and tenure related
px.bar(data_frame=df, x='Contract', y='tenure')
Lets create a copy of the existing dataframe and encode all the categorical features present and check the correlation
df_corr= df.copy()
lbe=LabelEncoder()
for col in df_corr.columns:
if df_corr[col].dtype != 'int64' or df_corr[col].dtype != 'float64':
df_corr[col]=lbe.fit_transform(df_corr[col])
# Let's confirm the correlation between tenure and contract
corr=df_corr.corr()
corr.loc['tenure', 'Contract']
0.6716065492281024
#Let's visualize the correaltion of "Churn" with other variables:
plt.figure(figsize=(15,8))
df_corr.corr()['Churn'].sort_values(ascending = False).plot(kind='bar')
<AxesSubplot:>
From all of the above we can come to the conclusion that the below mentioend features can be dropped
df.drop(['gender','PhoneService','TotalCharges'], axis=1, inplace=True)
4. Data pre-processing:
resultsDf= pd.DataFrame(columns=['Method', 'Accuracy'])
def Scale_Encode(X):
cat_cols=[]
num_cols=[]
for f in X.columns:
if X[f].dtype == 'int64' or X[f].dtype == 'float64' :
num_cols.append(f)
else:
cat_cols.append(f)
std=StandardScaler()
sc_num=std.fit_transform(X[num_cols])
sc_num=pd.DataFrame(sc_num,columns=num_cols)
for i in cat_cols:
le = LabelEncoder()
X[i] = le.fit_transform(X[i])
X=X.drop(columns=num_cols,axis=1)
X=X.merge(sc_num,left_index=True,right_index=True,how="left")
return X
def classification(model, model_name, balancing):
print('\n For ',model_name, '---->\n')
if balancing=='No':
model.fit(X_train, y_train)
y_predict= model.predict(X_test)
print('Training Accuracy: ', model.score(X_train, y_train))
print('Testing Accuracy: ', model.score(X_test, y_test))
print('ROC AUC Score: ', metrics.roc_auc_score(y_test,y_predict))
print(confusion_m(y_predict, balancing))
acc = metrics.accuracy_score(y_test, y_predict)
global resultsDf
tempResultsDf = pd.DataFrame({'Method':[model_name], 'Accuracy': [acc]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
elif balancing=='SMOTE':
model.fit(Xs_train, ys_train)
ys_predict= model.predict(Xs_test)
print('Training Accuracy: ', model.score(Xs_train, ys_train))
print('Testing Accuracy: ', model.score(Xs_test, ys_test))
print('ROC AUC Score: ', metrics.roc_auc_score(ys_test,ys_predict))
print(confusion_m(ys_predict, balancing))
acc = metrics.accuracy_score(ys_test, ys_predict)
resultsDf.loc[resultsDf.Method == model_name, 'Accuracy(SMOTE)'] = acc
else:
model.fit(Xcc_train, ycc_train)
ycc_predict= model.predict(Xcc_test)
print('Training Accuracy: ', model.score(Xcc_train, ycc_train))
print('Testing Accuracy: ', model.score(Xcc_test, ycc_test))
print('ROC AUC Score: ', metrics.roc_auc_score(ycc_test,ycc_predict))
print(confusion_m(ycc_predict, balancing))
acc = metrics.accuracy_score(ycc_test, ycc_predict)
resultsDf.loc[resultsDf.Method == model_name, 'Accuracy(CC)'] = acc
def confusion_m(p, balancing):
if balancing=='No':
print("\n Classification Report: \n", metrics.classification_report(y_test, p))
print('\n')
cm=metrics.confusion_matrix(y_test, p, labels=[0, 1])
df_cm = pd.DataFrame(cm, index = [i for i in ["No","Yes"]],
columns = [i for i in ["No","Yes"]])
print('Confusion Matrics:\n',df_cm)
elif balancing=='SMOTE':
print("\n Classification Report: \n", metrics.classification_report(ys_test, p))
print('\n')
cm=metrics.confusion_matrix(ys_test, p, labels=[0, 1])
df_cm = pd.DataFrame(cm, index = [i for i in ["No","Yes"]],
columns = [i for i in ["No","Yes"]])
print('Confusion Matrics:\n',df_cm)
else:
print("\n Classification Report: \n", metrics.classification_report(ycc_test, p))
print('\n')
cm=metrics.confusion_matrix(ycc_test, p, labels=[0, 1])
df_cm = pd.DataFrame(cm, index = [i for i in ["No","Yes"]],
columns = [i for i in ["No","Yes"]])
print('Confusion Matrics:\n',df_cm)
return df_cm
# Segregating the predictor and the target variables
X= Scale_Encode(df)
y=df["Churn"]
X=df.drop(["Churn"],axis=1)
# Train Test Split on the available dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=1, stratify=y)
# Checking the target balancing
print(Counter(y_train))
Counter({0: 3622, 1: 1308})
We saw that the target is imbalanced hence implementing target balance
# Train Test Split on balanced target using SMOTE
osmote=SMOTE()
Xs_train,ys_train=osmote.fit_resample(X_train,y_train)
Xs_test,ys_test=osmote.fit_resample(X_test,y_test)
print(Counter(ys_train))
Counter({0: 3622, 1: 3622})
# Train Test Split on balanced target using Cluster Centriods
cc=ClusterCentroids()
Xcc_train,ycc_train =cc.fit_resample(X_train,y_train)
Xcc_test,ycc_test =cc.fit_resample(X_test,y_test)
print(Counter(ycc_train))
Counter({0: 1308, 1: 1308})
5. Model training, testing and tuning:
# Listing the weak and strong classifiers to be used for testing
classifiers = [['DecisionTree',DecisionTreeClassifier()],
['RandomForest',RandomForestClassifier()],
['Bagging', BaggingClassifier()],
['AdaBoostClassifier', AdaBoostClassifier()],
['GradientBoostingClassifier ', GradientBoostingClassifier()],
['XGB', XGBClassifier()],
['CatBoost', CatBoostClassifier(logging_level='Silent')],
['Naive Bayes', GaussianNB()],
['KNeighbours', KNeighborsClassifier()],
['SVM', SVC()],
['LogisticRegression', LogisticRegression()]
]
# Checking the accuracy of each model on the unbalanced data
for models in classifiers:
classification(models[1], models[0], 'No')
For DecisionTree ---->
Training Accuracy: 0.9969574036511156
Testing Accuracy: 0.7236157122574538
ROC AUC Score: 0.6485237839278166
Classification Report:
precision recall f1-score support
0 0.81 0.81 0.81 1552
1 0.48 0.49 0.48 561
accuracy 0.72 2113
macro avg 0.65 0.65 0.65 2113
weighted avg 0.73 0.72 0.72 2113
Confusion Matrics:
No Yes
No 1255 297
Yes 287 274
No Yes
No 1255 297
Yes 287 274
For RandomForest ---->
Training Accuracy: 0.9969574036511156
Testing Accuracy: 0.7870326549929011
ROC AUC Score: 0.6934011889666831
Classification Report:
precision recall f1-score support
0 0.83 0.89 0.86 1552
1 0.63 0.49 0.55 561
accuracy 0.79 2113
macro avg 0.73 0.69 0.71 2113
weighted avg 0.78 0.79 0.78 2113
Confusion Matrics:
No Yes
No 1386 166
Yes 284 277
No Yes
No 1386 166
Yes 284 277
For Bagging ---->
Training Accuracy: 0.9789046653144016
Testing Accuracy: 0.7681022243256034
ROC AUC Score: 0.6662870748111802
Classification Report:
precision recall f1-score support
0 0.82 0.88 0.85 1552
1 0.58 0.45 0.51 561
accuracy 0.77 2113
macro avg 0.70 0.67 0.68 2113
weighted avg 0.75 0.77 0.76 2113
Confusion Matrics:
No Yes
No 1371 181
Yes 309 252
No Yes
No 1371 181
Yes 309 252
For AdaBoostClassifier ---->
Training Accuracy: 0.8058823529411765
Testing Accuracy: 0.8007572172266919
ROC AUC Score: 0.7169714886891965
Classification Report:
precision recall f1-score support
0 0.84 0.90 0.87 1552
1 0.65 0.54 0.59 561
accuracy 0.80 2113
macro avg 0.75 0.72 0.73 2113
weighted avg 0.79 0.80 0.79 2113
Confusion Matrics:
No Yes
No 1390 162
Yes 259 302
No Yes
No 1390 162
Yes 259 302
For GradientBoostingClassifier ---->
Training Accuracy: 0.8204868154158215
Testing Accuracy: 0.8097491717936584
ROC AUC Score: 0.717970716871566
Classification Report:
precision recall f1-score support
0 0.84 0.91 0.88 1552
1 0.69 0.52 0.59 561
accuracy 0.81 2113
macro avg 0.76 0.72 0.73 2113
weighted avg 0.80 0.81 0.80 2113
Confusion Matrics:
No Yes
No 1418 134
Yes 268 293
No Yes
No 1418 134
Yes 268 293
For XGB ---->
[22:41:45] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Training Accuracy: 0.9320486815415822
Testing Accuracy: 0.7841930903928065
ROC AUC Score: 0.7005738096550711
Classification Report:
precision recall f1-score support
0 0.84 0.88 0.86 1552
1 0.61 0.52 0.56 561
accuracy 0.78 2113
macro avg 0.72 0.70 0.71 2113
weighted avg 0.78 0.78 0.78 2113
Confusion Matrics:
No Yes
No 1364 188
Yes 268 293
No Yes
No 1364 188
Yes 268 293
For CatBoost ---->
Training Accuracy: 0.8685598377281947
Testing Accuracy: 0.8017037387600567
ROC AUC Score: 0.7107866108017715
Classification Report:
precision recall f1-score support
0 0.84 0.90 0.87 1552
1 0.66 0.52 0.58 561
accuracy 0.80 2113
macro avg 0.75 0.71 0.73 2113
weighted avg 0.79 0.80 0.79 2113
Confusion Matrics:
No Yes
No 1404 148
Yes 271 290
No Yes
No 1404 148
Yes 271 290
For Naive Bayes ---->
Training Accuracy: 0.7498985801217039
Testing Accuracy: 0.7524846190250828
ROC AUC Score: 0.7512645404928606
Classification Report:
precision recall f1-score support
0 0.89 0.75 0.82 1552
1 0.52 0.75 0.62 561
accuracy 0.75 2113
macro avg 0.71 0.75 0.72 2113
weighted avg 0.79 0.75 0.76 2113
Confusion Matrics:
No Yes
No 1170 382
Yes 141 420
No Yes
No 1170 382
Yes 141 420
For KNeighbours ---->
Training Accuracy: 0.8373225152129817
Testing Accuracy: 0.7747278750591576
ROC AUC Score: 0.6895777054964443
Classification Report:
precision recall f1-score support
0 0.83 0.87 0.85 1552
1 0.59 0.51 0.54 561
accuracy 0.77 2113
macro avg 0.71 0.69 0.70 2113
weighted avg 0.77 0.77 0.77 2113
Confusion Matrics:
No Yes
No 1352 200
Yes 276 285
No Yes
No 1352 200
Yes 276 285
For SVM ---->
Training Accuracy: 0.7906693711967545
Testing Accuracy: 0.792238523426408
ROC AUC Score: 0.6633680651267068
Classification Report:
precision recall f1-score support
0 0.81 0.94 0.87 1552
1 0.69 0.39 0.50 561
accuracy 0.79 2113
macro avg 0.75 0.66 0.68 2113
weighted avg 0.78 0.79 0.77 2113
Confusion Matrics:
No Yes
No 1456 96
Yes 343 218
No Yes
No 1456 96
Yes 343 218
For LogisticRegression ---->
Training Accuracy: 0.797971602434077
Testing Accuracy: 0.8050165641268339
ROC AUC Score: 0.7187327719278901
Classification Report:
precision recall f1-score support
0 0.84 0.90 0.87 1552
1 0.67 0.53 0.59 561
accuracy 0.81 2113
macro avg 0.75 0.72 0.73 2113
weighted avg 0.80 0.81 0.80 2113
Confusion Matrics:
No Yes
No 1401 151
Yes 261 300
No Yes
No 1401 151
Yes 261 300
# Lets check the accuracy of each model stored in the resultsDf
resultsDf
| Method | Accuracy | |
|---|---|---|
| 0 | DecisionTree | 0.723616 |
| 0 | RandomForest | 0.787033 |
| 0 | Bagging | 0.768102 |
| 0 | AdaBoostClassifier | 0.800757 |
| 0 | GradientBoostingClassifier | 0.809749 |
| 0 | XGB | 0.784193 |
| 0 | CatBoost | 0.801704 |
| 0 | Naive Bayes | 0.752485 |
| 0 | KNeighbours | 0.774728 |
| 0 | SVM | 0.792239 |
| 0 | LogisticRegression | 0.805017 |
# Checking the accuracy of each model where the target data is balanced using SMOTE
for models in classifiers:
classification(models[1], models[0], 'SMOTE')
For DecisionTree ---->
Training Accuracy: 0.9977912755383765
Testing Accuracy: 0.7416237113402062
ROC AUC Score: 0.7416237113402062
Classification Report:
precision recall f1-score support
0 0.72 0.79 0.75 1552
1 0.77 0.69 0.73 1552
accuracy 0.74 3104
macro avg 0.74 0.74 0.74 3104
weighted avg 0.74 0.74 0.74 3104
Confusion Matrics:
No Yes
No 1233 319
Yes 483 1069
No Yes
No 1233 319
Yes 483 1069
For RandomForest ---->
Training Accuracy: 0.9977912755383765
Testing Accuracy: 0.7931701030927835
ROC AUC Score: 0.7931701030927835
Classification Report:
precision recall f1-score support
0 0.77 0.83 0.80 1552
1 0.82 0.76 0.79 1552
accuracy 0.79 3104
macro avg 0.79 0.79 0.79 3104
weighted avg 0.79 0.79 0.79 3104
Confusion Matrics:
No Yes
No 1287 265
Yes 377 1175
No Yes
No 1287 265
Yes 377 1175
For Bagging ---->
Training Accuracy: 0.9896466040861402
Testing Accuracy: 0.7702963917525774
ROC AUC Score: 0.7702963917525775
Classification Report:
precision recall f1-score support
0 0.74 0.84 0.78 1552
1 0.81 0.70 0.75 1552
accuracy 0.77 3104
macro avg 0.78 0.77 0.77 3104
weighted avg 0.78 0.77 0.77 3104
Confusion Matrics:
No Yes
No 1297 255
Yes 458 1094
No Yes
No 1297 255
Yes 458 1094
For AdaBoostClassifier ---->
Training Accuracy: 0.8155715074544451
Testing Accuracy: 0.8041237113402062
ROC AUC Score: 0.8041237113402061
Classification Report:
precision recall f1-score support
0 0.83 0.76 0.80 1552
1 0.78 0.85 0.81 1552
accuracy 0.80 3104
macro avg 0.81 0.80 0.80 3104
weighted avg 0.81 0.80 0.80 3104
Confusion Matrics:
No Yes
No 1181 371
Yes 237 1315
No Yes
No 1181 371
Yes 237 1315
For GradientBoostingClassifier ---->
Training Accuracy: 0.8368304803975704
Testing Accuracy: 0.8089561855670103
ROC AUC Score: 0.8089561855670103
Classification Report:
precision recall f1-score support
0 0.83 0.78 0.80 1552
1 0.79 0.84 0.81 1552
accuracy 0.81 3104
macro avg 0.81 0.81 0.81 3104
weighted avg 0.81 0.81 0.81 3104
Confusion Matrics:
No Yes
No 1211 341
Yes 252 1300
No Yes
No 1211 341
Yes 252 1300
For XGB ---->
[22:42:07] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Training Accuracy: 0.949337382661513
Testing Accuracy: 0.8031572164948454
ROC AUC Score: 0.8031572164948453
Classification Report:
precision recall f1-score support
0 0.79 0.82 0.81 1552
1 0.81 0.79 0.80 1552
accuracy 0.80 3104
macro avg 0.80 0.80 0.80 3104
weighted avg 0.80 0.80 0.80 3104
Confusion Matrics:
No Yes
No 1271 281
Yes 330 1222
No Yes
No 1271 281
Yes 330 1222
For CatBoost ---->
Training Accuracy: 0.8993649917172832
Testing Accuracy: 0.8195876288659794
ROC AUC Score: 0.8195876288659794
Classification Report:
precision recall f1-score support
0 0.82 0.82 0.82 1552
1 0.82 0.82 0.82 1552
accuracy 0.82 3104
macro avg 0.82 0.82 0.82 3104
weighted avg 0.82 0.82 0.82 3104
Confusion Matrics:
No Yes
No 1268 284
Yes 276 1276
No Yes
No 1268 284
Yes 276 1276
For Naive Bayes ---->
Training Accuracy: 0.7755383765875207
Testing Accuracy: 0.7831829896907216
ROC AUC Score: 0.7831829896907216
Classification Report:
precision recall f1-score support
0 0.81 0.74 0.77 1552
1 0.76 0.82 0.79 1552
accuracy 0.78 3104
macro avg 0.79 0.78 0.78 3104
weighted avg 0.79 0.78 0.78 3104
Confusion Matrics:
No Yes
No 1151 401
Yes 272 1280
No Yes
No 1151 401
Yes 272 1280
For KNeighbours ---->
Training Accuracy: 0.8658199889563777
Testing Accuracy: 0.7364690721649485
ROC AUC Score: 0.7364690721649484
Classification Report:
precision recall f1-score support
0 0.75 0.71 0.73 1552
1 0.72 0.77 0.74 1552
accuracy 0.74 3104
macro avg 0.74 0.74 0.74 3104
weighted avg 0.74 0.74 0.74 3104
Confusion Matrics:
No Yes
No 1096 456
Yes 362 1190
No Yes
No 1096 456
Yes 362 1190
For SVM ---->
Training Accuracy: 0.7729155162893429
Testing Accuracy: 0.7577319587628866
ROC AUC Score: 0.7577319587628867
Classification Report:
precision recall f1-score support
0 0.78 0.71 0.75 1552
1 0.74 0.80 0.77 1552
accuracy 0.76 3104
macro avg 0.76 0.76 0.76 3104
weighted avg 0.76 0.76 0.76 3104
Confusion Matrics:
No Yes
No 1104 448
Yes 304 1248
No Yes
No 1104 448
Yes 304 1248
For LogisticRegression ---->
Training Accuracy: 0.7901711761457758
Testing Accuracy: 0.7893041237113402
ROC AUC Score: 0.7893041237113402
Classification Report:
precision recall f1-score support
0 0.81 0.75 0.78 1552
1 0.77 0.82 0.80 1552
accuracy 0.79 3104
macro avg 0.79 0.79 0.79 3104
weighted avg 0.79 0.79 0.79 3104
Confusion Matrics:
No Yes
No 1171 381
Yes 273 1279
No Yes
No 1171 381
Yes 273 1279
# Lets check the accuracy of each model stored in the resultsDf
resultsDf
| Method | Accuracy | Accuracy(SMOTE) | |
|---|---|---|---|
| 0 | DecisionTree | 0.723616 | 0.741624 |
| 0 | RandomForest | 0.787033 | 0.793170 |
| 0 | Bagging | 0.768102 | 0.770296 |
| 0 | AdaBoostClassifier | 0.800757 | 0.804124 |
| 0 | GradientBoostingClassifier | 0.809749 | 0.808956 |
| 0 | XGB | 0.784193 | 0.803157 |
| 0 | CatBoost | 0.801704 | 0.819588 |
| 0 | Naive Bayes | 0.752485 | 0.783183 |
| 0 | KNeighbours | 0.774728 | 0.736469 |
| 0 | SVM | 0.792239 | 0.757732 |
| 0 | LogisticRegression | 0.805017 | 0.789304 |
# Checking the accuracy of each model where the target data is balanced using Cluster Centroids
for models in classifiers:
classification(models[1], models[0], 'Cluster Centriods')
For DecisionTree ---->
Training Accuracy: 1.0
Testing Accuracy: 0.7388591800356507
ROC AUC Score: 0.7388591800356507
Classification Report:
precision recall f1-score support
0 0.74 0.74 0.74 561
1 0.74 0.74 0.74 561
accuracy 0.74 1122
macro avg 0.74 0.74 0.74 1122
weighted avg 0.74 0.74 0.74 1122
Confusion Matrics:
No Yes
No 416 145
Yes 148 413
No Yes
No 416 145
Yes 148 413
For RandomForest ---->
Training Accuracy: 1.0
Testing Accuracy: 0.8163992869875223
ROC AUC Score: 0.8163992869875223
Classification Report:
precision recall f1-score support
0 0.80 0.84 0.82 561
1 0.83 0.80 0.81 561
accuracy 0.82 1122
macro avg 0.82 0.82 0.82 1122
weighted avg 0.82 0.82 0.82 1122
Confusion Matrics:
No Yes
No 469 92
Yes 114 447
No Yes
No 469 92
Yes 114 447
For Bagging ---->
Training Accuracy: 0.9850917431192661
Testing Accuracy: 0.7843137254901961
ROC AUC Score: 0.7843137254901961
Classification Report:
precision recall f1-score support
0 0.77 0.82 0.79 561
1 0.81 0.75 0.78 561
accuracy 0.78 1122
macro avg 0.79 0.78 0.78 1122
weighted avg 0.79 0.78 0.78 1122
Confusion Matrics:
No Yes
No 460 101
Yes 141 420
No Yes
No 460 101
Yes 141 420
For AdaBoostClassifier ---->
Training Accuracy: 0.8226299694189603
Testing Accuracy: 0.8119429590017825
ROC AUC Score: 0.8119429590017826
Classification Report:
precision recall f1-score support
0 0.82 0.80 0.81 561
1 0.81 0.82 0.81 561
accuracy 0.81 1122
macro avg 0.81 0.81 0.81 1122
weighted avg 0.81 0.81 0.81 1122
Confusion Matrics:
No Yes
No 451 110
Yes 101 460
No Yes
No 451 110
Yes 101 460
For GradientBoostingClassifier ---->
Training Accuracy: 0.8581804281345565
Testing Accuracy: 0.8351158645276292
ROC AUC Score: 0.8351158645276293
Classification Report:
precision recall f1-score support
0 0.83 0.84 0.84 561
1 0.84 0.83 0.83 561
accuracy 0.84 1122
macro avg 0.84 0.84 0.84 1122
weighted avg 0.84 0.84 0.84 1122
Confusion Matrics:
No Yes
No 473 88
Yes 97 464
No Yes
No 473 88
Yes 97 464
For XGB ---->
[22:42:39] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Training Accuracy: 0.9931192660550459
Testing Accuracy: 0.8226381461675579
ROC AUC Score: 0.822638146167558
Classification Report:
precision recall f1-score support
0 0.82 0.83 0.82 561
1 0.83 0.81 0.82 561
accuracy 0.82 1122
macro avg 0.82 0.82 0.82 1122
weighted avg 0.82 0.82 0.82 1122
Confusion Matrics:
No Yes
No 467 94
Yes 105 456
No Yes
No 467 94
Yes 105 456
For CatBoost ---->
Training Accuracy: 0.9166666666666666
Testing Accuracy: 0.8324420677361853
ROC AUC Score: 0.8324420677361853
Classification Report:
precision recall f1-score support
0 0.83 0.84 0.83 561
1 0.83 0.83 0.83 561
accuracy 0.83 1122
macro avg 0.83 0.83 0.83 1122
weighted avg 0.83 0.83 0.83 1122
Confusion Matrics:
No Yes
No 469 92
Yes 96 465
No Yes
No 469 92
Yes 96 465
For Naive Bayes ---->
Training Accuracy: 0.7511467889908257
Testing Accuracy: 0.7566844919786097
ROC AUC Score: 0.7566844919786098
Classification Report:
precision recall f1-score support
0 0.80 0.69 0.74 561
1 0.73 0.82 0.77 561
accuracy 0.76 1122
macro avg 0.76 0.76 0.76 1122
weighted avg 0.76 0.76 0.76 1122
Confusion Matrics:
No Yes
No 388 173
Yes 100 461
No Yes
No 388 173
Yes 100 461
For KNeighbours ---->
Training Accuracy: 0.7928134556574924
Testing Accuracy: 0.749554367201426
ROC AUC Score: 0.7495543672014261
Classification Report:
precision recall f1-score support
0 0.74 0.76 0.75 561
1 0.76 0.74 0.75 561
accuracy 0.75 1122
macro avg 0.75 0.75 0.75 1122
weighted avg 0.75 0.75 0.75 1122
Confusion Matrics:
No Yes
No 428 133
Yes 148 413
No Yes
No 428 133
Yes 148 413
For SVM ---->
Training Accuracy: 0.7412079510703364
Testing Accuracy: 0.7406417112299465
ROC AUC Score: 0.7406417112299465
Classification Report:
precision recall f1-score support
0 0.75 0.72 0.74 561
1 0.73 0.76 0.74 561
accuracy 0.74 1122
macro avg 0.74 0.74 0.74 1122
weighted avg 0.74 0.74 0.74 1122
Confusion Matrics:
No Yes
No 406 155
Yes 136 425
No Yes
No 406 155
Yes 136 425
For LogisticRegression ---->
Training Accuracy: 0.7836391437308868
Testing Accuracy: 0.7887700534759359
ROC AUC Score: 0.7887700534759358
Classification Report:
precision recall f1-score support
0 0.80 0.78 0.79 561
1 0.78 0.80 0.79 561
accuracy 0.79 1122
macro avg 0.79 0.79 0.79 1122
weighted avg 0.79 0.79 0.79 1122
Confusion Matrics:
No Yes
No 436 125
Yes 112 449
No Yes
No 436 125
Yes 112 449
# Lets check the accuracy of each model stored in the resultsDf
resultsDf
| Method | Accuracy | Accuracy(SMOTE) | Accuracy(CC) | |
|---|---|---|---|---|
| 0 | DecisionTree | 0.723616 | 0.741624 | 0.738859 |
| 0 | RandomForest | 0.787033 | 0.793170 | 0.816399 |
| 0 | Bagging | 0.768102 | 0.770296 | 0.784314 |
| 0 | AdaBoostClassifier | 0.800757 | 0.804124 | 0.811943 |
| 0 | GradientBoostingClassifier | 0.809749 | 0.808956 | 0.835116 |
| 0 | XGB | 0.784193 | 0.803157 | 0.822638 |
| 0 | CatBoost | 0.801704 | 0.819588 | 0.832442 |
| 0 | Naive Bayes | 0.752485 | 0.783183 | 0.756684 |
| 0 | KNeighbours | 0.774728 | 0.736469 | 0.749554 |
| 0 | SVM | 0.792239 | 0.757732 | 0.740642 |
| 0 | LogisticRegression | 0.805017 | 0.789304 | 0.788770 |
Decision Tree
#Using the Decision Tree as a model
model= DecisionTreeClassifier()
model.fit(Xcc_train, ycc_train)
ycc_predict= model.predict(Xcc_test)
print('Training Accuracy: ', model.score(Xcc_train, ycc_train))
print('Testing Accuracy: ', model.score(Xcc_test, ycc_test))
Training Accuracy: 1.0 Testing Accuracy: 0.7308377896613191
It show that it is an over fit model where training score much better that the testing. Let's visualize the decision tree
Let's Visualize how the tree looks without any parameter tuning
train_char_label = ['No', 'Yes']
fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(model, feature_names=list(Xcc_train), class_names = list(train_char_label), filled=True)
plt.title('Decision Tree Classifier')
Text(0.5, 1.0, 'Decision Tree Classifier')
It can be clearly visualized how overfit the model is.
Lets see after pruning the Dtree
m2 = DecisionTreeClassifier(criterion = 'entropy',max_depth=3, random_state=1)
m2.fit(Xcc_train,ycc_train)
print('Traing Accuracy :' , m2.score(Xcc_train, ycc_train))
print('Testing Accuracy :' , m2.score(Xcc_test, ycc_test))
train_char_label = ['No', 'Yes']
fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(m2, feature_names=list(Xcc_train), class_names = list(train_char_label), filled=True)
plt.title('Decision Tree(Pruned) Classifier')
Traing Accuracy : 0.7549694189602446 Testing Accuracy : 0.7629233511586453
Text(0.5, 1.0, 'Decision Tree(Pruned) Classifier')
Let's try to find out the best parameter to be implemented for the decision Tree. we shall be using 5 number of cross-validation you have to try for each selected set of hyperparameters
param = {'max_depth': range(2, 11), 'criterion': ['entropy', 'gini'],
'random_state': range(1,30) } #Parameters for parameter tuning using the Grid Search
best_dt = GridSearchCV(DecisionTreeClassifier(),param, cv=5)
best_dt.fit(Xcc_train,ycc_train)
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
param_grid={'criterion': ['entropy', 'gini'],
'max_depth': range(2, 11),
'random_state': range(1, 30)})
print("Best parameter:", best_dt.best_params_)
print("Best cross validaton score", best_dt.best_score_)
Best parameter: {'criterion': 'entropy', 'max_depth': 7, 'random_state': 7}
Best cross validaton score 0.767966663260987
The score of the model increases with the parameter tuning.
tuned_dt = DecisionTreeClassifier(criterion = 'entropy', max_depth=7, random_state=7)
tuned_dt.fit(Xcc_train,ycc_train)
y_predict= tuned_dt.predict(Xcc_test)
acc= tuned_dt.score(Xcc_test,ycc_test)
print(f'Training Score : {tuned_dt.score(Xcc_train,ycc_train) * 100:.2f}' )
print(f'Testing Score : {tuned_dt.score(Xcc_test,ycc_test) * 100:.2f}' )
cm=confusion_m(y_predict, 'CC')
plt.figure(figsize = (7,5))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
Training Score : 79.78
Testing Score : 77.36
Classification Report:
precision recall f1-score support
0 0.75 0.83 0.79 561
1 0.81 0.72 0.76 561
accuracy 0.77 1122
macro avg 0.78 0.77 0.77 1122
weighted avg 0.78 0.77 0.77 1122
Confusion Matrics:
No Yes
No 466 95
Yes 159 402
<AxesSubplot:>
# Dataframe to store the accuracy of all the tuned models
tuned_acc= pd.DataFrame(columns=['Method', 'Accuracy'])
tuned_acc = pd.DataFrame({'Method':['Decision Tree'], 'Accuracy': [acc]})
# lets check the feature importance
imp = pd.Series(data=tuned_dt.feature_importances_, index=Xcc_train.columns).sort_values(ascending=False)
plt.figure(figsize=(10,12))
plt.title("Feature importance")
ax = sns.barplot(y=imp.index, x=imp.values, palette="Blues_d", orient='h')
# Let's see the FP vs TP rate for the tuned model
probs = tuned_dt.predict_proba(Xcc_test)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(ycc_test, y_predict)
roc_auc = metrics.auc(fpr,tpr)
plt.figure(dpi=100)
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.title('Decision Tree')
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
Bagging
# using bagging on the tuned decision tree
bgcl = BaggingClassifier(base_estimator=tuned_dt,n_estimators=20,max_features=9,random_state=1).fit(Xcc_train, ycc_train)
bgcl.fit(Xcc_train, ycc_train)
bgcl_pred=bgcl.predict(Xcc_test)
print('Training Score', bgcl.score(Xcc_train,ycc_train))
print('Testing Score', bgcl.score(Xcc_test,ycc_test))
Training Score 0.8287461773700305 Testing Score 0.8092691622103387
cm= confusion_m(bgcl_pred, 'CC')
plt.figure(figsize = (7,5))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
Classification Report:
precision recall f1-score support
0 0.82 0.79 0.81 561
1 0.80 0.83 0.81 561
accuracy 0.81 1122
macro avg 0.81 0.81 0.81 1122
weighted avg 0.81 0.81 0.81 1122
Confusion Matrics:
No Yes
No 445 116
Yes 98 463
<AxesSubplot:>
Applying Bagging on the decision tree helps improve the accuracy
tempResultsDf = pd.DataFrame({'Method':['Bagging'], 'Accuracy': [bgcl.score(Xcc_test, ycc_test)]})
tuned_acc = pd.concat([tuned_acc, tempResultsDf])
tuned_acc
| Method | Accuracy | |
|---|---|---|
| 0 | Decision Tree | 0.773619 |
| 0 | Bagging | 0.809269 |
Random Forest
rfcl = RandomForestClassifier(random_state=1)
rfcl = rfcl.fit(Xcc_train, ycc_train)
y_predict1= rfcl.predict(Xcc_test)
print('Training Accuracy: ', rfcl.score(Xcc_train, ycc_train))
print('Testing Accuracy: ', rfcl.score(Xcc_test, ycc_test))
Training Accuracy: 1.0 Testing Accuracy: 0.8190730837789661
# Checking the default paramters passed
print(rfcl.get_params())
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 1, 'verbose': 0, 'warm_start': False}
imp = pd.Series(data=rfcl.feature_importances_, index=Xcc_train.columns).sort_values(ascending=False)
plt.figure(figsize=(10,12))
plt.title("Feature importance")
ax = sns.barplot(y=imp.index, x=imp.values, palette="Blues_d", orient='h')
# Applying for Parameter tuning to find the best model
n_estimators = range(1,100)
max_features = ['auto', 'sqrt']
max_depth = range(1,100)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
params = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
rf = RandomForestClassifier(random_state = 42)
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=params,
n_iter = 100, scoring='neg_mean_absolute_error',
cv = 3, verbose=2, random_state=42, n_jobs=-1,
return_train_score=True)
rf_random.fit(Xcc_train, ycc_train)
Fitting 3 folds for each of 100 candidates, totalling 300 fits
RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(random_state=42),
n_iter=100, n_jobs=-1,
param_distributions={'bootstrap': [True, False],
'max_depth': range(1, 100),
'max_features': ['auto', 'sqrt'],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10],
'n_estimators': range(1, 100)},
random_state=42, return_train_score=True,
scoring='neg_mean_absolute_error', verbose=2)
print('The tuned parameters are: ', rf_random.best_params_)
The tuned parameters are: {'n_estimators': 73, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 99, 'bootstrap': True}
rfc = RandomForestClassifier(n_estimators = 73, max_depth=18, min_samples_split=10, random_state = 1, bootstrap='False',
max_features='auto', min_samples_leaf= 2)
rfc.fit(Xcc_train, ycc_train)
yrfc_predict= rfc.predict(Xcc_test)
acc= rfc.score(Xcc_test, ycc_test)
print('Training Accuracy: ', rfc.score(Xcc_train, ycc_train))
print('Testing Accuracy: ', rfc.score(Xcc_test, ycc_test))
Training Accuracy: 0.8975535168195719 Testing Accuracy: 0.8163992869875223
cm=confusion_m(yrfc_predict, 'CC')
plt.figure(figsize = (7,5))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
Classification Report:
precision recall f1-score support
0 0.81 0.83 0.82 561
1 0.82 0.81 0.81 561
accuracy 0.82 1122
macro avg 0.82 0.82 0.82 1122
weighted avg 0.82 0.82 0.82 1122
Confusion Matrics:
No Yes
No 463 98
Yes 108 453
<AxesSubplot:>
probs = rfc.predict_proba(Xcc_test)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(ycc_test, yrfc_predict)
roc_auc = metrics.auc(fpr,tpr)
plt.figure(dpi=100)
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.title('Random Forest')
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
tempResultsDf = pd.DataFrame({'Method':['Random Forest'], 'Accuracy': [acc]})
tuned_acc = pd.concat([tuned_acc, tempResultsDf])
tuned_acc
| Method | Accuracy | |
|---|---|---|
| 0 | Decision Tree | 0.773619 |
| 0 | Bagging | 0.809269 |
| 0 | Random Forest | 0.816399 |
XGBoost
xgb = XGBClassifier(eval_metric='mlogloss')
xgb.fit(Xcc_train, ycc_train)
yxgb_predict= xgb.predict(Xcc_test)
print('Training Accuracy :', xgb.score(Xcc_train, ycc_train))
print('Testing Accuracy :', xgb.score(Xcc_test, ycc_test))
Training Accuracy : 0.9931192660550459 Testing Accuracy : 0.8226381461675579
param_grid = {'n_estimators': range(0,1000,25)}
# find the best parameter
grid_search = GridSearchCV(xgb, param_grid, scoring='recall')
grid_result = grid_search.fit(Xcc_train, ycc_train)
print(f'Best result: {grid_result.best_score_} for {grid_result.best_params_}')
Best result: 0.8119358895615804 for {'n_estimators': 25}
xgb = XGBClassifier(n_estimators= 25, eval_metric='mlogloss')
xgb.fit(Xcc_train, ycc_train)
yxgb_predict= xgb.predict(Xcc_test)
print('Training Accuracy :', xgb.score(Xcc_train, ycc_train))
print('Testing Accuracy :', xgb.score(Xcc_test, ycc_test))
param_grid = {'max_depth': range(1,8,1)}
# find the best parameter
grid_search = GridSearchCV(xgb, param_grid, scoring='recall')
grid_result = grid_search.fit(Xcc_train, ycc_train)
print(f'Best result: {grid_result.best_score_} for {grid_result.best_params_}')
Training Accuracy : 0.9216360856269113
Testing Accuracy : 0.8199643493761141
Best result: 0.821084495919979 for {'max_depth': 4}
xgb = XGBClassifier(n_estimators= 25, max_depth=4, eval_metric='mlogloss')
xgb.fit(Xcc_train, ycc_train)
yxgb_predict= xgb.predict(Xcc_test)
print('Training Accuracy :', xgb.score(Xcc_train, ycc_train))
print('Testing Accuracy :', xgb.score(Xcc_test, ycc_test))
param_grid = {'min_child_weight': np.arange(0.0001, 0.5, 0.001)}
# find the best parameter
grid_search = GridSearchCV(xgb, param_grid, scoring='recall')
grid_result = grid_search.fit(Xcc_train, ycc_train)
print(f'Best result: {grid_result.best_score_} for {grid_result.best_params_}')
Training Accuracy : 0.8654434250764526
Testing Accuracy : 0.8288770053475936
Best result: 0.8241350062882045 for {'min_child_weight': 0.3671}
xgb_tuned = XGBClassifier(n_estimators= 25, max_depth=4, min_child_weight=0.3671, eval_metric='mlogloss')
xgb_tuned.fit(Xcc_train, ycc_train)
yxgbt_predict= xgb_tuned.predict(Xcc_test)
print('Training Accuracy :', xgb_tuned.score(Xcc_train, ycc_train))
print('Testing Accuracy :', xgb_tuned.score(Xcc_test, ycc_test))
Training Accuracy : 0.8512996941896025 Testing Accuracy : 0.8360071301247772
cm=confusion_m(yxgbt_predict, 'CC')
plt.figure(figsize = (7,5))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
Classification Report:
precision recall f1-score support
0 0.83 0.85 0.84 561
1 0.85 0.82 0.83 561
accuracy 0.84 1122
macro avg 0.84 0.84 0.84 1122
weighted avg 0.84 0.84 0.84 1122
Confusion Matrics:
No Yes
No 477 84
Yes 100 461
<AxesSubplot:>
probs = xgb_tuned.predict_proba(Xcc_test)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(ycc_test, yxgbt_predict)
roc_auc = metrics.auc(fpr,tpr)
plt.figure(dpi=100)
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.title('XG Boost')
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
tempResultsDf = pd.DataFrame({'Method':['XGBoost'], 'Accuracy': [xgb_tuned.score(Xcc_test, ycc_test)]})
tuned_acc = pd.concat([tuned_acc, tempResultsDf])
tuned_acc
| Method | Accuracy | |
|---|---|---|
| 0 | Decision Tree | 0.773619 |
| 0 | Bagging | 0.809269 |
| 0 | Random Forest | 0.816399 |
| 0 | XGBoost | 0.836007 |
CatBoost
cb = CatBoostClassifier(logging_level='Silent')
cb.fit(Xcc_train, ycc_train)
print('Training Accuracy :' , cb.score(Xcc_train, ycc_train))
print('Testing Accuracy: ', cb.score(Xcc_test, ycc_test))
print(cb.get_all_params())
Training Accuracy : 0.9166666666666666
Testing Accuracy: 0.8324420677361853
{'nan_mode': 'Min', 'eval_metric': 'Logloss', 'iterations': 1000, 'sampling_frequency': 'PerTree', 'leaf_estimation_method': 'Newton', 'grow_policy': 'SymmetricTree', 'penalties_coefficient': 1, 'boosting_type': 'Plain', 'model_shrink_mode': 'Constant', 'feature_border_type': 'GreedyLogSum', 'bayesian_matrix_reg': 0.10000000149011612, 'l2_leaf_reg': 3, 'random_strength': 1, 'rsm': 1, 'boost_from_average': False, 'model_size_reg': 0.5, 'pool_metainfo_options': {'tags': {}}, 'subsample': 0.800000011920929, 'use_best_model': False, 'class_names': [0, 1], 'random_seed': 0, 'depth': 6, 'posterior_sampling': False, 'border_count': 254, 'classes_count': 0, 'auto_class_weights': 'None', 'sparse_features_conflict_fraction': 0, 'leaf_estimation_backtracking': 'AnyImprovement', 'best_model_min_trees': 1, 'model_shrink_rate': 0, 'min_data_in_leaf': 1, 'loss_function': 'Logloss', 'learning_rate': 0.015533000230789185, 'score_function': 'Cosine', 'task_type': 'CPU', 'leaf_estimation_iterations': 10, 'bootstrap_type': 'MVS', 'max_leaves': 64}
grid = {'n_estimators': [50, 100, 200, 300],
"learning_rate": np.linspace(0,0.2,2),
"max_depth": [3,4,5]}
g_search= GridSearchCV(estimator = cb, param_grid = grid, scoring ='accuracy', cv = 5)
g_result = g_search.fit(Xcc_train, ycc_train)
print(f'Best result: {g_result.best_score_} for {g_result.best_params_}')
Best result: 0.8157473764103162 for {'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 100}
cb_tuned = CatBoostClassifier(learning_rate= 0.2, max_depth= 4, n_estimators= 100, logging_level='Silent')
cb_tuned.fit(Xcc_train, ycc_train)
ycb_predict= cb_tuned.predict(Xcc_test)
print('Training Accuracy :', cb_tuned.score(Xcc_train, ycc_train))
print('Testing Accuracy :', cb_tuned.score(Xcc_test, ycc_test))
Training Accuracy : 0.8581804281345565 Testing Accuracy : 0.8342245989304813
cm=confusion_m(ycb_predict, 'CC')
plt.figure(figsize = (7,5))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
Classification Report:
precision recall f1-score support
0 0.84 0.83 0.83 561
1 0.83 0.84 0.83 561
accuracy 0.83 1122
macro avg 0.83 0.83 0.83 1122
weighted avg 0.83 0.83 0.83 1122
Confusion Matrics:
No Yes
No 466 95
Yes 91 470
<AxesSubplot:>
imp = pd.Series(data=cb_tuned.get_feature_importance(), index=Xcc_train.columns).sort_values(ascending=False)
plt.figure(figsize=(10,12))
plt.title("Feature importance")
ax = sns.barplot(y=imp.index, x=imp.values, palette="Blues_d", orient='h')
probs = cb_tuned.predict_proba(Xcc_test)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(ycc_test, yxgbt_predict)
roc_auc = metrics.auc(fpr,tpr)
plt.figure(dpi=100)
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.title('CatBoost')
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
tempResultsDf = pd.DataFrame({'Method':['CatBoost'], 'Accuracy': [cb_tuned.score(Xcc_test, ycc_test)]})
tuned_acc = pd.concat([tuned_acc, tempResultsDf])
tuned_acc
| Method | Accuracy | |
|---|---|---|
| 0 | Decision Tree | 0.773619 |
| 0 | Bagging | 0.809269 |
| 0 | Random Forest | 0.816399 |
| 0 | XGBoost | 0.836007 |
| 0 | CatBoost | 0.834225 |
AdaBoost
abcl = AdaBoostClassifier(n_estimators=10, random_state=1)
#abcl = AdaBoostClassifier( n_estimators=50,random_state=1)
abcl = abcl.fit(Xcc_train, ycc_train)
yab_predict= abcl.predict(Xcc_test)
print('Training Accuracy :' , abcl.score(Xcc_train, ycc_train))
print('Testing Accuracy: ', abcl.score(Xcc_test, ycc_test))
Training Accuracy : 0.7966360856269113 Testing Accuracy: 0.7932263814616756
cm=confusion_m(yab_predict, 'CC')
plt.figure(figsize = (7,5))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
Classification Report:
precision recall f1-score support
0 0.79 0.79 0.79 561
1 0.79 0.79 0.79 561
accuracy 0.79 1122
macro avg 0.79 0.79 0.79 1122
weighted avg 0.79 0.79 0.79 1122
Confusion Matrics:
No Yes
No 445 116
Yes 116 445
<AxesSubplot:>
tempResultsDf = pd.DataFrame({'Method':['AdaBoost'], 'Accuracy': [abcl.score(Xcc_test, ycc_test)]})
tuned_acc = pd.concat([tuned_acc, tempResultsDf])
tuned_acc
| Method | Accuracy | |
|---|---|---|
| 0 | Decision Tree | 0.773619 |
| 0 | Bagging | 0.809269 |
| 0 | Random Forest | 0.816399 |
| 0 | XGBoost | 0.836007 |
| 0 | CatBoost | 0.834225 |
| 0 | AdaBoost | 0.793226 |
Gradient Boosting
gbc=GradientBoostingClassifier(learning_rate=0.1)
gbc.fit(Xcc_train, ycc_train)
ygbc_predict= gbc.predict(X_test)
print('Training Accuracy :' , gbc.score(Xcc_train, ycc_train))
print('Testing Accuracy: ', gbc.score(Xcc_test, ycc_test))
Training Accuracy : 0.8581804281345565 Testing Accuracy: 0.8351158645276292
print('Default Paramter\n',gbc.get_params())
Default Paramter
{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
param = {'n_estimators': [50, 100, 200, 300],
"learning_rate": np.linspace(0,0.2,2),
"max_depth": [3,4,5]}
g_search= GridSearchCV(estimator = gbc, param_grid = param, scoring ='accuracy', cv = 5)
g_result = g_search.fit(Xcc_train, ycc_train)
print(f'Best result: {g_result.best_score_} for {g_result.best_params_}')
Best result: 0.8096346678732502 for {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100}
gbc = GradientBoostingClassifier(learning_rate=0.2, n_estimators= 100, max_depth=3)
gbc.fit(Xcc_train, ycc_train)
ygbc_predict= gbc.predict(Xcc_test)
print('Training Accuracy :', gbc.score(Xcc_train, ycc_train))
print('Testing Accuracy :', gbc.score(Xcc_test, ycc_test))
param_grid = {'min_samples_split':range(200,1001,200),
'min_samples_leaf':range(30,71,10)}
# find the best parameter
grid_search = GridSearchCV(gbc, param_grid, scoring='accuracy', cv=5)
grid_result = grid_search.fit(Xcc_train, ycc_train)
print(f'Best result: {grid_result.best_score_} for {grid_result.best_params_}')
Training Accuracy : 0.9204892966360856
Testing Accuracy : 0.839572192513369
Best result: 0.8184235108665507 for {'min_samples_leaf': 60, 'min_samples_split': 600}
gbc_tuned = GradientBoostingClassifier(learning_rate=0.2, n_estimators= 100, max_depth=3, min_samples_leaf= 60,
min_samples_split=600 , random_state=1)
gbc_tuned.fit(Xcc_train, ycc_train)
ygbct_predict= gbc_tuned.predict(Xcc_test)
print('Training Accuracy :', gbc_tuned.score(Xcc_train, ycc_train))
print('Testing Accuracy :', gbc_tuned.score(Xcc_test, ycc_test))
Training Accuracy : 0.8642966360856269 Testing Accuracy : 0.8431372549019608
cm=confusion_m(ygbct_predict, 'CC')
plt.figure(figsize = (7,5))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
Classification Report:
precision recall f1-score support
0 0.84 0.84 0.84 561
1 0.84 0.84 0.84 561
accuracy 0.84 1122
macro avg 0.84 0.84 0.84 1122
weighted avg 0.84 0.84 0.84 1122
Confusion Matrics:
No Yes
No 472 89
Yes 87 474
<AxesSubplot:>
probs = gbc_tuned.predict_proba(Xcc_test)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(ycc_test, yxgbt_predict)
roc_auc = metrics.auc(fpr,tpr)
plt.figure(dpi=100)
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.title('CatBoost')
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
tempResultsDf = pd.DataFrame({'Method':['Gradient Boost'], 'Accuracy': [gbc_tuned.score(Xcc_test, ycc_test)]})
tuned_acc = pd.concat([tuned_acc, tempResultsDf])
tuned_acc
| Method | Accuracy | |
|---|---|---|
| 0 | Decision Tree | 0.773619 |
| 0 | Bagging | 0.809269 |
| 0 | Random Forest | 0.816399 |
| 0 | XGBoost | 0.836007 |
| 0 | CatBoost | 0.834225 |
| 0 | AdaBoost | 0.793226 |
| 0 | Gradient Boost | 0.843137 |
Pickle Model
The best model turned out to be GradientBoostingClassifier. Hence let's save it for further use.
import pickle
best_model=GradientBoostingClassifier(learning_rate=0.2, n_estimators= 200, max_depth=3, min_samples_leaf= 30,
min_samples_split=1000,random_state=1 )
best_model.fit(Xcc_train,ycc_train)
# Save the trained model as a pickle string.
saved_model = pickle.dumps(best_model)
# Save to file in the current working directory
pkl_filename = "pickle_model.pkl"
with open(pkl_filename, 'wb') as file:
pickle.dump(best_model, file)
# Load from file
with open(pkl_filename, 'rb') as file:
pickle_model = pickle.load(file)
# Calculate the accuracy score and predict target values
score = pickle_model.score(Xcc_test, ycc_test)
print("Test score: {0:.2f} %".format(100 * score))
Ypredict = pickle_model.predict(Xcc_test)
Test score: 84.31 %
6. Conclusion and improvisation: